Functions and variables

misc

underscore_to_space <- partial(str_replace_all, pattern = "_", replacement = " ")
oldest_node_tree <- trimmed_species_tree_as_df %>% pull(x) %>% max()

# labels for plot axis
max_msa_index <- kappa_caseins_alignment_positions %>% pull(msa_index) %>% max()
align_breaks <- c(
  1, # min
  max_msa_index, # max
  seq(100, max_msa_index, 100), # intermediate positions
  position_align_chymosin_cleavage_site + 0.5 # cleavage site
)
align_breaks_labels <- c(
  1, # min
  max_msa_index, # max
  seq(100, max_msa_index, 100), # intermediate positions
  "â–²" # cleavage site
) %>% as.character()
align_minor_breaks <- c(1, max_msa_index, seq(25, max_msa_index, 25))

export plots and tables

save_plot_pdf <- partial(
  ggsave, 
  path = plot_output_dir, 
  device = cairo_pdf, 
  units = "in"
)
save_plot_jpg <- partial(
  ggsave, 
  path = plot_output_dir, 
  dpi = "retina", 
  units = "in"
)
save_plot <- function(plot, name, ...) {
  suppressMessages(
    save_plot_pdf(plot = plot, filename = glue("{name}.pdf"), ...)
    )
  suppressMessages(
    save_plot_jpg(plot = plot, filename = glue("{name}.jpg"), ...)
    )
}
save_table <- function(table, name, ...) {
  write(x = table, file = file.path(table_output_dir, glue("{name}.tex")), ...)
}

colours

Custom amino acid colours based on the colourblind friendly OkabeIto colour palette (see http://jfly.iam.u-tokyo.ac.jp/color/ and https://github.com/clauswilke/colorblindr)

aa_custom_OkabeIto_colours <- c(
  "cysteine" = "#E69F00", # orange
  "histidine" = "#56B4E9", # light blue
  "charged +" = "#0072B2", # marine blue
  "charged -" = "#D55E00", # red
  "aromatic" = "#CC79A7", # pink
  "proline" = "#F0E442", # yellow
  "polar" = "#009E73", # green
  "apolar" = "#999999", # light grey
  "glycine" = "#999999", # light grey
  "gap" = "#ffffff" # white
)

Kappa-casein parts colours

colour_parts_kappa_casein <- c(
  "PKC" = "#e69f00", # orange
  "GMP" = "#56b4e9" # blue
)

Figure 1

# custom geom to be used for annotation with consistent style
geom_cladelabel_custom <- partial(
  geom_cladelabel,
  fontsize = 2.4,
  offset = 0.5,
  offset.text = 0.1,
  family = "arial",
  barsize = 0.4
)

# geological time periods
# package geoscale
# Gradstein, F. M., Ogg, J. M., and Schmitz, M., 2012, A geologic time scale, Boston, USA, Elsevier.
data(timescales)

mammals_periods <- timescales %>%
  purrr::pluck("ICS2015") %>% 
  filter(End < oldest_node_tree, !is.na(Part_of)) %>%
  rename(geologic.period = Part_of) %>%
  group_by(geologic.period) %>%
  summarise(start = max(Start), end = min(End)) %>%
  ungroup() %>%
  mutate(geologic.period = fct_reorder(geologic.period, end)) %>%
  arrange(desc(geologic.period)) %>%
  identity()

mammals_periods_breaks <- c(mammals_periods$start) %>% round(1)

# Panel A - maximum likelihood protein tree
plot_fitted_tree <- fitted_tree %>%
  ggtree(size = 0.1, color = "black") +
  xlim_tree(3.2) +
  geom_treescale(
    x = 0.1,
    y = 30,
    offset = 2,
    fontsize = 2.5,
    linesize = 0.4,
    width = 0.1
  ) +
  geom_tiplab(
    size = 0.7,
    mapping = aes(label = underscore_to_space(label)),
    family = "arial",
    fontface = "italic"
  ) +
  scale_x_continuous(expand = c(0.02, 0)) +
  scale_y_continuous(expand = c(0.02, 0)) +
  geom_cladelabel_custom(node = 101, label = "Monotremes") +
  geom_cladelabel_custom(node = 196, label = "Marsupials") +
  geom_cladelabel_custom(node = 194, label = "Afrotheria") +
  geom_cladelabel_custom(node = 106, label = "Glires") +
  geom_cladelabel_custom(node = 186, label = "Bats") +
  geom_cladelabel_custom(node = 139, label = "New world monkeys") +
  geom_cladelabel_custom(node = 126, label = "Old world monkeys") +
  geom_cladelabel_custom(node = 134, label = "Apes") +
  geom_cladelabel_custom(node = 175, label = "Carnivorans") +
  geom_cladelabel_custom(node = 149, label = "Ruminantia") +
  geom_cladelabel_custom(node = 164, label = "Cetaceans") +
  geom_cladelabel_custom(node = 169, label = "Camelids") +
  geom_cladelabel_custom(node = 172, label = "Odd-toed ungulates") +
  labs(
    tag = "A"
  ) +
  NULL



# Panel B- scatter plot
plot_compare_evodist_divtime <- compare_evodist_divtime %>%
  filter(corrected == TRUE) %>%
  ggplot(
    mapping = aes(
      x = divergence.time,
      y = distance.fit,
      label = glue("{species.1}-{species.2}") # dev only
    )
  ) +
  geom_vline( # separates geological periods
    xintercept = mammals_periods_breaks,
    colour = "darkgrey",
    linetype = "dotted", size = 0.5
  ) +
  geom_point( # couples of species
    size = 0.7,
    alpha = 0.1,
    colour = "black"
  ) +
  geom_text( # annotate geological periods at the top
    data = mammals_periods %>% filter(geologic.period != "Quaternary"),
    mapping = aes(label = geologic.period, x = start + (end - start) / 2),
    y = Inf,
    family = "arial",
    vjust = 1.4,
    size = 2.5
  ) +
  scale_x_continuous(
    limits = c(0, max(mammals_periods_breaks)),
    minor_breaks = seq(0, 200, 10),
    sec.axis = sec_axis(~., breaks = mammals_periods_breaks),
    expand = c(0.01, 0)
  ) +
  labs(
    x = "Divergence Time (Mya)",
    y = "Pairwise distance",
    tag = "B"
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text = element_text(size = 7),
    axis.title = element_text(size = 9),
    axis.ticks.x = element_line(size = 0.2),
    panel.border = element_rect(fill = NA, colour = "black", size = 0.4)
  ) +
  NULL



(plot_compare_evodist_divtime_combined <- plot_fitted_tree +
  plot_compare_evodist_divtime +
  plot_layout(widths = c(2.8, 5.2)) &
  theme(
    plot.tag.position = c(0, 1),
    text = element_text(family = "arial", face = "plain")
  )
)

save_plot(
  plot = plot_compare_evodist_divtime_combined,
  name = "figure_1"
)

Figure 2

# equal expand values for the y axis to assure the 2 plots align well
indels_kappacasein_parts_plot_expand <- c(0.005, 0.0)


# species tree on the left
plot_kappa_casein_tree <- ggplot(trimmed_species_tree, size = 0.5) +
  geom_tree(size = 0.2) +
  geom_treescale(x = 100, y = 90, fontsize = 3, linesize = 0.2) +
  theme_bw() +
  theme(
    plot.margin = margin(0, 0, 0, 0),
    panel.spacing = margin(0, 5, 5, 5),
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
  ) +
  scale_y_continuous(expand = indels_kappacasein_parts_plot_expand) +
  scale_x_continuous(expand = c(0, 0)) +
  xlim_tree(167)


# length of PKC and GMP + protein tandem repeats
bidirectional_breaks <- c(
  range(parts_positions_neg$position), # min/max lengths
  seq(0, 200, by = 75),
  seq(0, -200, by = -75)
) %>% sort()


plot_indels_kappacasein_parts <- ggplot( # first the length bar plot
  data = parts_positions_neg,
  mapping = aes(x = position, y = species, fill = part)
) +
  geom_barh(stat = "identity") +
  geom_rect( # then the PTRs annotations
    data = df_plot_indels,
    mapping = aes(
      ymin = as.numeric(species) - 0.4,
      ymax = as.numeric(species) + 0.4,
      xmin = N_flank_position,
      xmax = C_flank_position + 1,
      colour = type
    ),
    inherit.aes = FALSE,
    alpha = 0.0,
    size = 0.25
  ) + # add duplications
  geom_vline(xintercept = 0, size = 0.1, colour = "black") +
  scale_fill_manual(values = c(colour_parts_kappa_casein)) +
  scale_colour_manual(values = c("tandem repeat" = "black")) +
  scale_y_discrete(
    expand = indels_kappacasein_parts_plot_expand,
    position = "right",
    labels = underscore_to_space
  ) +
  scale_x_continuous(
    expand = c(0, 0),
    breaks = bidirectional_breaks,
    minor_breaks = seq(-200, 200, 25),
    labels = abs(bidirectional_breaks),
    sec.axis = sec_axis(~.,
      breaks = c(0),
      labels = c("PKC /GMP cleavage site")
    )
  ) +
  theme_bw() +
  theme(
    legend.position = c(1, 1),
    legend.justification = c(1, 1),
    axis.title.y = element_blank(),
    axis.line.y = element_blank(),
    axis.text.y = element_text(size = 7, face = "italic"),
    axis.ticks = element_line(size = 0.2),
    legend.title = element_blank(),
    legend.text = element_text(margin = margin(0, 0, 0, 2), size = 9),
    plot.margin = margin(0, 0, 0, 0),
    panel.spacing = margin(0, 5, 5, 0),
    legend.background = element_rect(fill = "transparent"),
    legend.box.background = element_rect(
      colour = "black",
      fill = "white",
      size = 0.1
    ),
    legend.spacing = unit(1, "mm"),
    legend.key = element_blank(),
    panel.background = element_rect(fill = "white", colour = "white"),
    plot.background = element_rect(fill = "white", colour = "white"),
    legend.box.spacing = unit(0.01, "line"),
    legend.key.size = unit(5, "pt"),
    panel.border = element_rect(
      fill = "transparent",
      colour = "black",
      size = 0.2
    )
  ) +
  labs(
    x = "distance from the PKC/GMP cleavage site"
  )


(plot_combined_tree_lengths <- plot_kappa_casein_tree +
  plot_indels_kappacasein_parts +
  plot_layout(widths = c(0.5, 2), ncol = 2)
)

save_plot(
  plot = plot_combined_tree_lengths,
  name = "figure_2"
)

Figure 3

(plot_aa_comp_range <- kappa_caseins_aa_comp_df_plot %>%
  ggplot(mapping = aes(
    x = freq_min,
    y = part,
    xend = freq_max,
    yend = part,
    colour = group
  )) +
  geom_segment(show.legend = FALSE, size = 5) +
  facet_wrap(~residues_f, ncol = 5, nrow = 4) +
  scale_colour_manual("amino acids", values = aa_custom_OkabeIto_colours) +
  theme_bw() +
  scale_x_continuous(limits = c(0, NA), expand = c(0, 0)) +
  labs(
    x = "amino acid frequency"
  ) +
  theme(
    text = element_text(colour = "black", family = "Arial"),
    axis.text = element_text(colour = "black", family = "Arial"),
    axis.ticks = element_line(size = 0.1),
    axis.text.x = element_text(),
    axis.title.y = element_blank(),
    panel.grid = element_blank(),
    panel.grid.major = element_line(linetype = "dotted"),
    panel.grid.major.x = element_line(size = 0.1, colour = "black"),
    panel.grid.minor.x = element_line(size = 0.05, colour = "darkgrey"),
    panel.border = element_rect(size = 0.2, colour = "black"),
    strip.background = element_rect(fill = "white", size = 0.0, colour = NA),
    strip.text = element_text(hjust = 0, size = 10),
    panel.spacing.x = unit(1, "lines")
  ) +
  NULL
)

save_plot(
  plot = plot_aa_comp_range,
  name = "figure_3"
)

Figure 4

# scales and theme ----
scale_x_align_residue_plot <- scale_x_continuous(
  expand = c(0, 0),
  breaks = align_breaks,
  labels = align_breaks_labels,
  minor_breaks = align_minor_breaks
)

scale_y_align_residue_plot <- scale_y_continuous(
  expand = c(0, 0),
  breaks = c(0, 0.5, 1)
)

theme_align_residue_plot <- theme_bw() +
  theme(
    legend.position = "top",
    legend.title = element_blank()
  )

# names and labels ----
label_other_residues <- "other residues"
label_glutamine <- "glutamine"
label_asparagine <- "asparagine"
label_tyrosine <- "tyrosine"
label_other_aromatic <- "other aromatic (F or W)"
label_charged_minus <- "negatively charged (D or E)"
label_charged_plus <- "positively charged (R, K or H)"
label_pSer <- "pSer"
label_Ser <- "Ser"
label_Ch <- "goat" # Capra hircus
label_BtCh <- "cow & goat" # Bos taurus and Capra hircus

# other residues diplayed in grey ----
colour_other_residues <- c()
colour_other_residues[label_other_residues] <- "#e7e7e7"

# colours for each plot ----
colours_asngln <- colour_other_residues
colours_asngln[label_glutamine] <- "#461b32"
colours_asngln[label_asparagine] <- "#009E73"

colours_aromatics <- colour_other_residues
colours_aromatics[label_other_aromatic] <- "#CC79A7"
colours_aromatics[label_tyrosine] <- "#461b32"

colours_charged <- colour_other_residues
colours_charged[label_charged_minus] <- "#e41a1c"
colours_charged[label_charged_plus] <- "#377eb8"


colours_phosphorylations <- colour_other_residues
colours_phosphorylations[label_pSer] <- "#247a62"
colours_phosphorylations[label_Ser] <- "#8da0cb"
colours_known_pSer <- c()
colours_known_pSer[label_BtCh] <- "#a65628"
colours_known_pSer[label_Ch] <- "#4daf4a"

# vertical line at PKC/GMP cleavage site
vline_cleavage <- geom_vline(
  xintercept = position_align_chymosin_cleavage_site + 0.5,
  colour = "black",
  size = 0.2
)


# weight residues according to species tree ----
fct_weight_conservation <- . %>%
  select(-seq_index, -residue) %>%
  left_join(kappa_caseins_phylo_tree_weights, by = "species") %>%
  group_by(msa_index, class) %>%
  summarise(
    n_w = sum(weight_norm)
  ) %>%
  ungroup()

# hydrophobicity legend ----
kd_range_max <- max(KyteDoolittle_scale_df$hydrophobicity)
kd_range_min <- min(KyteDoolittle_scale_df$hydrophobicity)
hydrophobicity_breaks <- c(kd_range_min, 2, 0, -2, kd_range_max)

# known phosphorylated serines ----
# from Uniprot
# goat: https://www.uniprot.org/uniprot/P02670#ptm_processing
# cow: https://www.uniprot.org/uniprot/P02668#ptm_processing
known_pSer_site_df <- tribble(
  ~msa_index, ~label,
  355, label_Ch,
  294, label_BtCh,
  179, label_BtCh
)

# Panel A - glutamine and asparagine ----

# dataset
df_plot_asngln <- kappa_caseins_alignment_positions %>%
  filter(residue != "-") %>%
  mutate(
    class = case_when(
      residue == "Q" ~ label_glutamine,
      residue == "N" ~ label_asparagine,
      TRUE ~ label_other_residues
    ) %>%
      fct_relevel(label_other_residues, label_asparagine, label_glutamine) # order colours for plot
  ) %>%
  fct_weight_conservation()

# plot
plot_alignment_asngln <- df_plot_asngln %>%
  ggplot(mapping = aes(y = n_w, x = msa_index, fill = class)) +
  geom_bar(stat = "identity", colour = NA) +
  scale_fill_manual(
    values = colours_asngln,
    guide = guide_legend(reverse = TRUE)
  ) +
  theme_align_residue_plot +
  NULL



# Panel B - tyrosines and aromatics ----

# dataset
df_plot_aromatics <- kappa_caseins_alignment_positions %>%
  filter(residue != "-") %>%
  mutate(
    class = case_when(
      residue == "Y" ~ label_tyrosine,
      residue %in% c("F", "W") ~ label_other_aromatic,
      TRUE ~ label_other_residues
    ) %>%
      fct_relevel(label_other_residues, label_other_aromatic, label_tyrosine) # order colours for plot
  ) %>%
  fct_weight_conservation()


# plot
plot_alignment_aromatics <- df_plot_aromatics %>%
  ggplot(mapping = aes(y = n_w, x = msa_index, fill = class)) +
  geom_bar(stat = "identity", colour = NA) +
  scale_fill_manual(
    values = colours_aromatics,
    guide = guide_legend(reverse = TRUE)
  ) +
  theme_align_residue_plot +
  NULL



# Panel C - charged residues ----

# dataset
df_plot_charged <- kappa_caseins_alignment_positions %>%
  filter(residue != "-") %>%
  mutate(
    class = case_when(
      residue %in% c("R", "K", "H") ~ label_charged_plus,
      residue %in% c("E", "D") ~ label_charged_minus,
      TRUE ~ label_other_residues
    ) %>%
      fct_relevel(label_other_residues, label_charged_plus, label_charged_minus) # order colours for plot
  ) %>%
  fct_weight_conservation()

# plot
plot_alignment_charged <- df_plot_charged %>%
  ggplot(mapping = aes(y = n_w, x = msa_index, fill = class)) +
  geom_bar(stat = "identity", colour = NA) +
  scale_fill_manual(
    values = colours_charged,
    guide = guide_legend(reverse = TRUE)
  ) +
  theme_align_residue_plot +
  NULL


# Panel D - predicted disorder ----

# dataset
df_plot_disorder <- kappa_casein_disorder_align %>%
  filter(!is.na(seq_index)) %>%
  mutate(
    class = cut(iupred_short, breaks = seq(0, 1, 0.2))
  ) %>%
  fct_weight_conservation()


# plot
plot_alignment_disorder <- df_plot_disorder %>%
  ggplot(mapping = aes(y = n_w, x = msa_index, fill = class)) +
  geom_bar(stat = "identity", colour = NA) +
  scale_fill_brewer(palette = "Greens") +
  theme_bw() +
  theme(
    legend.position = "top",
    legend.title = element_text(size = 8)
  ) +
  labs(fill = "predicted disorder") +
  NULL


# Panel E - hydrophobicity ----

# dataset
df_plot_hydrophobicity <- kappa_caseins_hydrophobicity %>%
  filter(residue != "-") %>%
  rename(class = hydrophobicity) %>%
  fct_weight_conservation()

# plot
plot_alignment_hydrophobicity <- df_plot_hydrophobicity %>%
  ggplot(mapping = aes(y = n_w, x = msa_index, fill = class)) +
  geom_bar(stat = "identity", colour = NA) +
  scale_fill_distiller(
    palette = "BrBG",
    limits = c(kd_range_min, kd_range_max),
    direction = 1,
    breaks = hydrophobicity_breaks,
    guide = guide_colourbar(ticks.colour = "black", barwidth = 12)
  ) +
  theme_bw() +
  theme(
    legend.position = "top",
    legend.title = element_text(size = 8)
  ) +
  labs(fill = "hydrophobicity") +
  NULL


# Panel F - phosphorylations ----

# dataset
df_plot_phosphorylations <- kappa_casein_fam20c_matches_tidy %>%
  mutate(match = TRUE) %>%
  rename(seq_index = match_position) %>%
  full_join(kappa_caseins_alignment_positions, by = c("species", "seq_index")) %>%
  mutate(match = !is.na(match)) %>%
  filter(residue != "-") %>%
  mutate(
    class = case_when(
      match == TRUE ~ label_pSer,
      residue == "S" ~ label_Ser,
      TRUE ~ label_other_residues
    ) %>%
      fct_relevel(label_other_residues, label_Ser, label_pSer) # order colours for plot
  ) %>%
  fct_weight_conservation()

# plot
plot_alignment_phosphorylations <- df_plot_phosphorylations %>%
  ggplot(mapping = aes(y = n_w, x = msa_index, fill = class)) +
  geom_bar(stat = "identity", colour = NA) +
  geom_text(data = known_pSer_site_df, mapping = aes(x = msa_index, colour = label), y = 1, vjust = 0.5, label = "â–¾", inherit.aes = FALSE, size = 4) +

  scale_fill_manual(
    values = colours_phosphorylations,
    guide = guide_legend(reverse = TRUE)
  ) +
  theme_align_residue_plot +
  NULL


# combine all plots ----

(plot_alignment_combined <- {
  (plot_alignment_asngln + labs(tag = "A")) /
    (plot_alignment_aromatics + labs(tag = "B")) /
    (plot_alignment_charged + labs(tag = "C")) /
    (plot_alignment_disorder + labs(tag = "D")) /
    (plot_alignment_hydrophobicity + labs(tag = "E")) /
    (plot_alignment_phosphorylations + labs(tag = "F"))
} + plot_layout(ncol = 1) &
  vline_cleavage & # following scales, themes and labs applied to all subplots
  scale_x_align_residue_plot &
  scale_y_align_residue_plot &
  theme(
    text = element_text(family = "Arial", colour = "black"),
    legend.text = element_text(size = 7),
    legend.box.spacing = unit(0.01, "line"),
    axis.text = element_text(size = 7, colour = "black"),
    axis.title = element_text(size = 7),
    panel.border = element_rect(size = 0.2),
    axis.ticks = element_line(size = 0.2),
    legend.key.size = unit(5, "pt"),
    plot.tag.position = c(0, 1),
    plot.tag = element_text(colour = "black", size = 11),
    legend.justification = c(0, 0),
  ) &
  labs(
    x = "amino acid position in alignment",
    y = "weighted frequency"
  )
)

save_plot(
  plot = plot_alignment_combined,
  name = "figure_4"
)

Figure 5

pH_breaks <- c(3, 4, 5, 6, 7)
pH_minor_breaks <- seq(1.5, 7.5, 0.5)
ncpr_highlighted_species <- c(
  "Saimiri_boliviensis",
  "Cavia_porcellus",
  "Homo_sapiens",
  "Mus_musculus",
  "Bos_taurus"
)
geom_line_highlight_species <- partial(geom_line, mapping = aes(colour = species), size = 0.3)

# panel A
plot_ncpr_change_mature <- kappa_caseins_ncpr %>%
  filter(part == "mature") %>%
  ggplot(mapping = aes(x = pH, y = net_charge_per_residue, group = species)) +
  geom_hline(yintercept = 0, size = 0.1, linetype = "dotted") +
  geom_line(size = 0.1, colour = "lightgray") +
  facet_wrap(~part) +
  scale_y_continuous(
    limits = c(-0.21, 0.21),
    sec.axis = sec_axis(~.)
  ) +
  scale_x_continuous(
    breaks = pH_breaks,
    minor_breaks = pH_minor_breaks,
    expand = c(0, 0)
  ) +
  scale_color_brewer(
    palette = "Set1",
    labels = underscore_to_space,
    guide = guide_legend(
      nrow = 2,
      title.hjust = 0,
      label.hjust = 0,
      title.vjust = 1,
      label.vjust = 0.5,
      byrow = TRUE,
      direction = "horizontal"
    )
  ) +
  labs(
    y = "net charge per residue",
    x = "pH"
  ) +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    legend.text.align = 0,
    legend.text = element_text(face = "italic", size = 8),
    legend.key.size = unit(10, "pt"),
    legend.position = "bottom",
    legend.box.spacing = unit(0.03, "line"),
    legend.justification = c(1, 0),
    panel.background = element_rect(size = 0.2, colour = "black"),
    panel.grid.major = element_line(size = 0.2),
    plot.tag.position = c(0, 1),
    strip.background = element_blank(),
    panel.spacing = unit(1.2, "line"),
    axis.text = element_text(size = 9, colour = "black"),
    axis.title = element_text(size = 10, colour = "black"),
    axis.ticks = element_line(size = 0.2, colour = "black")
  ) +
  NULL

# panel B
# same plot, different data
plot_ncpr_change_parts <- plot_ncpr_change_mature %+%
  (kappa_caseins_ncpr %>% filter(part != "mature")) &
  theme(legend.position = "none")

# combination of the 2 plots
# highlight species of interests
(plot_charge_combined <- (
  plot_ncpr_change_mature +
    geom_line_highlight_species(
      data = subset(
        kappa_caseins_ncpr,
        part == "mature" &
          species %in% ncpr_highlighted_species
      )
    ) +
    labs(tag = "A")
) +
  (
    plot_ncpr_change_parts +
      geom_line_highlight_species(
        data = subset(
          kappa_caseins_ncpr,
          part != "mature" &
            species %in% ncpr_highlighted_species
        )
      ) +
      labs(tag = "B")
  ) +
  plot_layout(widths = c(1, 2))
)

save_plot(
  plot = plot_charge_combined,
  name = "figure_5"
)

Figure 6

# merge predicted count phosphorylations/o-glycosylations
df_phospho_glyco_counts <- bind_rows(
  phosphorylation = kappa_casein_fam20c_counts,
  Oglycosylation = kappa_casein_oglyc_glycomine_counts,
  .id = "PTM"
) %>%
  order_species_tree() %>%
  order_cleavage_parts() %>%
  mutate(n = if_else(part == "PKC", -n, n)) # uses negative values for PKC

# plot phosphorylations
plot_phospho_counts <- df_phospho_glyco_counts %>%
  filter(PTM == "phosphorylation") %>%
  ggplot(mapping = aes(x = n, y = species, fill = part)) +
  geom_barh(stat = "identity") +
  scale_fill_manual(values = c(colour_parts_kappa_casein)) +
  scale_y_discrete(expand = c(0.005, 0.0), position = NULL) +
  scale_x_continuous(
    expand = c(0.005, 0),
    limits = c(-8, 8),
    breaks = seq(-8, 8, 2),
    minor_breaks = seq(-8, 8, 1),
    labels = c(seq(8, 2, -2), 0, seq(2, 8, 2)),
    sec.axis = dup_axis()
  ) +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    legend.position = c(0, 1),
    legend.justification = c(0, 1),
    legend.title = element_blank(),
    legend.background = element_rect(size = 0.1, colour = "black"),
    legend.key.size = unit(5, "pt"),
    legend.text = element_text(size = 8)
  ) +
  labs(
    title = "phosphorylations predictions"
  ) +
  NULL

# plot glycosylations
plot_glyco_counts <- df_phospho_glyco_counts %>%
  filter(PTM == "Oglycosylation") %>%
  ggplot(mapping = aes(x = n, y = species, fill = part)) +
  geom_barh(stat = "identity") +
  scale_fill_manual(values = c(colour_parts_kappa_casein)) +
  scale_y_discrete(
    expand = c(0.005, 0.0),
    position = "right",
    labels = underscore_to_space
  ) +
  scale_x_continuous(
    expand = c(0.005, 0),
    limits = c(-20, 20),
    breaks = seq(-20, 20, 5),
    minor_breaks = seq(-20, 20, 1),
    labels = c(seq(20, 5, -5), 0, seq(5, 20, 5)),
    sec.axis = dup_axis()
  ) +
  theme_bw() +
  theme(
    axis.title = element_blank(),
    legend.position = "none",
    axis.text.y = element_text(face = "italic", size = 6),
  ) +
  labs(
    title = "O-glycosylation predictions"
  ) +
  NULL

plot_phospho_glyco_counts <- {
  plot_kappa_casein_tree + {
    {
      plot_phospho_counts + plot_glyco_counts
    } &
      theme(
        plot.margin = margin(0, 3, 0, 3),
        panel.border = element_rect(size = 0.1),
        axis.title.y = element_blank(),
        axis.text.x = element_text(size = 6),
        axis.ticks = element_line(size = 0.1, lineend = "butt"),
        plot.title = element_text(size = 8),
        plot.subtitle = element_blank()
      ) &
      geom_vline(xintercept = 0, size = 0.05, colour = "black")
  }
} +
  plot_layout(widths = c(0.7, 4)) +
  theme(
    plot.margin = margin(0, 0, 0, 0)
  )

save_plot(
  plot = plot_phospho_glyco_counts,
  name = "figure_6"
)

Figure 7

all_cysteines_positions_rel_cattle <- kappa_caseins_cysteines_positions_rel_cattle %>%
  pull(cattle_position) %>%
  unique()
SS_bonds_categories <- kappa_caseins_SS_bonds_classes %>%
  pull(bond) %>%
  levels()

# kappa-casein residues for every position with a cysteine in a species
plot_kappa_casein_cysteines <- kappa_caseins_cysteines_positions_rel_cattle %>%
  mutate(position_id = group_indices(., cattle_position)) %>% # to simulate continuous axis
  ggplot(mapping = aes(
    x = position_id,
    y = species,
    label = residue,
    fill = group
  )) +
  geom_tile() +
  geom_vline(xintercept = seq(1.5, 11.5, 1), colour = "black", size = 0.1) +
  geom_text(family = "Source Code Pro", size = 2) +
  scale_fill_manual(values = aa_custom_OkabeIto_colours) +
  theme_bw() +
  theme(legend.position = "bottom") +
  scale_x_continuous(
    breaks = 1:12,
    labels = all_cysteines_positions_rel_cattle,
    expand = c(0, 0),
    sec.axis = dup_axis()
  ) +
  scale_y_discrete(
    expand = c(0, 0)
  ) +
  guides(fill = guide_legend(nrow = 9, ncol = 1)) +
  theme(
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank(),
    legend.direction = "vertical",
    legend.position = c(0.7, 0),
    legend.key.size = unit(2, "pt"),
    legend.title = element_text(size = 5),
    legend.text = element_text(size = 5),
    legend.box.margin = margin(2, 2, 2, 2, unit = "pt"),
    legend.margin = margin(2, 2, 2, 2, unit = "pt"),
    legend.background = element_rect(fill = "transparent"),
    legend.box.background = element_rect(
      colour = "black",
      fill = "white",
      size = 0.1
    ),
    legend.spacing = unit(0.5, "mm"),
    legend.justification = c(.5, 0)
  ) +
  labs(
    x = "position",
    fill = "amino acids"
  ) +
  NULL



# possibilities to form dimer/oligo/intrachain SS bonds
plot_SS_bonds <- kappa_caseins_SS_bonds_classes %>%
  filter(possibility) %>%
  arrange(bond) %>%
  mutate(bond_id = group_indices(., bond)) %>% # to simulate continuous axis
  ggplot(mapping = aes(x = bond_id, y = species)) +
  geom_text(label = "\u2713", size = 4) + # draw a tick
  geom_vline(xintercept = c(1.5, 2.5), size = 0.1, colour = "black") +
  scale_y_discrete(
    position = "right",
    labels = underscore_to_space,
    drop = FALSE
  ) +
  scale_x_continuous(
    breaks = 1:3,
    labels = SS_bonds_categories,
    sec.axis = dup_axis(),
    expand = c(0.25, 0)
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.y = element_text(size = 5, face = "italic"),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.border = element_rect(size = 0.1, colour = "black"),
    panel.grid.major.x = element_blank()
  ) +
  NULL

(plot_combined_cysteines <- {
  plot_kappa_casein_tree + {
    {
      plot_kappa_casein_cysteines + plot_SS_bonds
    } +
      plot_layout(widths = c(3.5, 1)) &
      theme(
        axis.title = element_blank(),
        axis.text.x = element_text(size = 5),
        axis.ticks = element_line(size = 0.2),
        panel.border = element_rect(size = 0.1, colour = "black"),
        panel.grid.major.x = element_blank(),
        plot.margin = margin(0, 0.5, 0, 0),
      )
  }
} +
  plot_layout(widths = c(0.5, 3.5)) +
  NULL
)

save_plot(
  plot = plot_combined_cysteines,
  name = "figure_7"
)

Figure 8

# base plots for this figure and some supplementary figures
plot_kappa_caseins_parts <- kappa_casein_parts_lengths %>%
  mutate(part = fct_relevel(part, "GMP")) %>%
  ggplot(mapping = aes(x = length + 0.5, y = species, fill = part)) +
  geom_barh(stat = "identity") +
  scale_x_continuous(
    expand = c(0, 0),
    breaks = c(1, seq(50, 300, 50)),
    minor_breaks = seq(0, 300, 25)
  ) +
  scale_y_discrete(
    position = "right",
    labels = underscore_to_space,
    expand = c(0, 0)
  ) +
  scale_fill_manual(values = c(colour_parts_kappa_casein)) +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    legend.position = c(1, 1),
    legend.justification = c(1, 1),
    legend.background = element_rect(fill = "transparent"),
    legend.box.background = element_rect(colour = "black", fill = "white", size = 0.2),
    legend.spacing = unit(1, "mm"),
    legend.key = element_blank(),
    legend.box.spacing = unit(0.01, "line"),
    legend.key.size = unit(5, "pt"),
    legend.text = element_text(margin = margin(0, 0, 0, 2), size = 8),
    axis.ticks = element_line(size = 0.2),
    axis.text.y = element_text(size = 7, face = "italic"),
    text = element_text(family = "Arial"),
    axis.title.x = element_text(size = 8),
    axis.text.x = element_text(size = 7),
    panel.background = element_rect(fill = "white", colour = "white"),
    plot.background = element_rect(fill = "white", colour = "white"),
    panel.border = element_rect(fill = "transparent", colour = "black", size = 0.2),
    axis.title.y = element_blank(),
    plot.title = element_text(size = 9),
    plot.subtitle = element_blank(),
    panel.grid.major.x = element_line(size = 0.2),
    panel.grid.minor.x = element_blank()
  ) +
  labs(x = "amino acid position") +
  NULL


# data sets
pepsin_positions <- kappa_caseins_cleavage_positions %>%
  filter(peptidase == "pepsin")
plasmin_positions <- kappa_caseins_cleavage_positions %>%
  filter(peptidase == "plasmin")

plot_kappa_casein_pepsin <- plot_kappa_caseins_parts +
  geom_segment(
    size = 0.1,
    data = pepsin_positions,
    mapping = aes(
      x = cleavage_position + 0.5,
      xend = cleavage_position + 0.5,
      y = as.integer(species) - 0.5,
      yend = as.integer(species) + 0.5
    ),
    inherit.aes = FALSE
  ) +
  theme(
    legend.position = "none",
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  labs(title = "Pepsin")

plot_kappa_casein_plasmin <- plot_kappa_caseins_parts +
  geom_segment(
    size = 0.1,
    data = plasmin_positions,
    mapping = aes(
      x = cleavage_position + 0.5,
      xend = cleavage_position + 0.5,
      y = as.integer(species) - 0.5,
      yend = as.integer(species) + 0.5
    ),
    inherit.aes = FALSE
  ) +
  labs(title = "Plasmin")

(cleavage_sites_position_plot <- {
  plot_kappa_casein_tree +
    plot_kappa_casein_pepsin +
    plot_kappa_casein_plasmin
} +
  plot_layout(width = c(1, 4, 4)) &
  theme(plot.margin = margin(0, 0, 0, 0))
)

save_plot(
  plot = cleavage_sites_position_plot,
  name = "figure_8"
)

Figure S1

# positions clades
clades_to_annotate <- tribble(
  ~node, ~clade,
  101, "monotremes",
  197, "marsupials",
  103, "placentals"
) %>%
  left_join(trimmed_species_tree_as_df, by = "node") %>%
  select(clade, branch, y)

plot_kappa_casein_tree_clades <- plot_kappa_casein_tree +
  geom_text(
    data = clades_to_annotate,
    mapping = aes(x = branch, y = y, label = clade),
    vjust = -0.3,
    size = 2.8,
    hjust = 0.5
  )

plot_msa <- kappa_caseins_alignment_positions %>%
  filter(residue != "-") %>%
  left_join(amino.acid.classifications %>%
    filter(classification == "custom: charge and aromatic"),
  by = c("residue" = "residue")
  ) %>%
  ggplot(mapping = aes(x = msa_index, y = species, fill = class)) +
  geom_tile() +
  scale_fill_manual(values = aa_custom_OkabeIto_colours) +
  scale_x_continuous(
    expand = c(0, 0),
    breaks = align_breaks,
    labels = align_breaks_labels,
    minor_breaks = align_minor_breaks
  ) +
  scale_y_discrete(
    position = "right",
    expand = c(0.005, 0.0),
    labels = underscore_to_space
  ) +
  labs(
    x = "amino acid position in alignment"
  ) +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.title.y.right = element_blank(),
    plot.margin = margin(0, 0, 0, 0),
    panel.spacing = margin(0, 0, 0, 0),
    axis.text.y = element_text(face = "italic", size = 7)
  ) +
  NULL


(plot_tree_msa <- plot_kappa_casein_tree_clades +
  plot_msa +
  plot_layout(ncol = 2, nrow = 1, widths = c(0.2, 0.7)) +
  NULL
)

save_plot(
  plot = plot_tree_msa,
  name = "figure_S1"
)

Figure S2

# same plot, different data
(plot_compare_evodist_divtime_nocorrect <- plot_compare_evodist_divtime %+%
  (compare_evodist_divtime %>%
    filter(corrected == FALSE)) &
  theme(plot.tag = element_blank()))

save_plot(
  plot = plot_compare_evodist_divtime_nocorrect,
  name = "figure_S2"
)

Figure S3

species_old_indels <- c(
  "Homo_sapiens" = "human",
  "Bos_taurus" = "cow",
  "Tachyglossus_aculeatus" = "echidna",
  "Ornithorhynchus_anatinus" = "platypus",
  "Monodelphis_domestica" = "opossum",
  "Trichosurus_vulpecula" = "possum"
)


# tree
tips_to_drop_old_indels <- trimmed_species_tree_as_df %>%
  filter(isTip, !label %in% names(species_old_indels)) %>%
  pull(node)

tree_old_indels <- trimmed_species_tree %>%
  drop.tip(tips_to_drop_old_indels)

tree_old_indels_df <- tree_old_indels %>%
  ggplot2::fortify()

plot_tree_old_indels <- tree_old_indels_df %>%
  ggtree(size = 0.4) +
  geom_treescale(x = 50, y = 2)

# "barplot"
aa_old_indels_df <- kappa_caseins_alignment_positions %>%
  filter(species %in% names(species_old_indels)) %>%
  arrange(msa_index) %>%
  mutate(is_gap = residue == "-") %>% # remove positions with gaps in **all** selected species
  group_by(msa_index) %>%
  mutate(all_gaps = all(is_gap)) %>%
  ungroup() %>%
  filter(!all_gaps, !is_gap) %>%
  mutate(new_index = group_indices(., msa_index)) %>%
  select(-residue, -all_gaps) %>% # order plot according to the new tree
  mutate_at("species", as.character) %>%
  left_join(tree_old_indels_df, by = c("species" = "label")) %>%
  mutate(species = fct_reorder(species, y)) %>%
  select(species, new_index)

aa_old_indels_df_max_length <- max(aa_old_indels_df$new_index)


plot_bars_old_indels <- aa_old_indels_df %>%
  ggplot(mapping = aes(x = new_index, y = species)) +
  geom_tile(colour = "darkgrey", fill = "black") +
  theme_bw() +
  theme(
    text = element_text(family = "Arial", colour = "black"),
    axis.ticks = element_line(size = 0.2, colour = "black"),
    axis.title.y = element_blank(),
    axis.line.y = element_blank(),
    axis.text.y = element_text(face = "italic", colour = "black"),
    axis.title.x = element_text(colour = "black"),
    axis.text.x = element_text(colour = "black"),
    plot.margin = margin(0, 0, 0, 0),
    panel.background = element_rect(fill = "white", colour = "white"),
    plot.background = element_rect(fill = "white", colour = "white"),
    legend.box.spacing = unit(0.01, "line"),
    legend.key.size = unit(5, "pt"),
    panel.border = element_blank(),
    strip.background = element_blank(),
    plot.subtitle = element_blank()
  ) +
  labs(x = "amino acid position in alignment") +
  scale_x_continuous(
    expand = c(0.02, 0),
    breaks = c(1, seq(50, 300, 50), aa_old_indels_df_max_length),
    minor_breaks = seq(0, 300, 25)
  ) +
  scale_y_discrete(position = "right", labels = underscore_to_space) +
  NULL


(plot_tree_old_indels <- plot_tree_old_indels +
  plot_bars_old_indels +
  plot_layout(widths = c(0.25, 1))
)

save_plot(
  plot = plot_tree_old_indels,
  name = "figure_S3"
)

Figure S4

(aacomp_scatter_plot <- kappa_caseins_parts_aa_composition %>%
  mutate(residues = fct_relevel(residues, list_residues_ordered_aa_comp)) %>%
  ggplot(mapping = aes(x = GMP, y = PKC)) +
  geom_abline(slope = 1, color = "darkgrey", size = 0.1) +
  geom_point(alpha = 0.3, size = 0.1) +
  facet_wrap(~residues, ncol = 5) +
  theme_bw() +
  theme(
    text = element_text(colour = "black"),
    strip.background = element_rect(fill = "white", colour = "black", size = 0),
    strip.text = element_text(size = 9, margin = margin(1, 1, 1, 1, unit = "pt")),
    panel.border = element_rect(size = 0.1),
    panel.spacing.x = unit(1, "lines"),
    axis.text = element_text(colour = "black"),
    axis.ticks = element_line(colour = "black", size = 0.1),
    panel.grid = element_blank(),
    panel.grid.major = element_line(
      size = 0.1,
      linetype = "dotted",
      colour = "black"
    ),
    panel.grid.minor = element_line(
      size = 0.05,
      linetype = "dotted",
      colour = "darkgrey"
    ),
  ) +
  scale_x_continuous(
    limits = c(0, 0.31),
    expand = c(0, 0),
    breaks = seq(0, 0.4, 0.1)
  ) +
  scale_y_continuous(
    limits = c(0, 0.31),
    expand = c(0, 0), breaks = seq(0, 0.4, 0.1)
  ) +
  coord_equal() +
  labs(
    x = "amino acid frequency (GMP)",
    y = "amino acid frequency (PKC)"
  ) +
  NULL
)

save_plot(
  plot = aacomp_scatter_plot,
  name = "figure_S4"
)

Figure S5

# phosphoryaltion predictions
plot_kappa_casein_pred_phospho_positions <- plot_kappa_caseins_parts +
  geom_segment(
    size = 0.1,
    data = kappa_casein_fam20c_matches_tidy %>%
      rename(seq_index = match_position),
    mapping = aes(
      x = seq_index,
      xend = seq_index,
      y = as.integer(species) - 0.5,
      yend = as.integer(species) + 0.5
    ),
    inherit.aes = FALSE
  ) +
  theme(
    legend.position = "none",
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  labs(title = "Phosphorylations") +
  NULL

# O-glycosylation predictions
plot_kappa_casein_pred_oglyc_positions <- plot_kappa_caseins_parts +
  geom_segment(
    size = 0.1,
    data = kappa_casein_oglyc_predictions %>%
      filter(Oglyc_pred) %>%
      rename(seq_index = position),
    mapping = aes(
      x = seq_index,
      xend = seq_index,
      y = as.integer(species) - 0.5,
      yend = as.integer(species) + 0.5
    ),
    inherit.aes = FALSE
  ) +
  labs(title = "O-glycosylations") +
  NULL

plot_phospho_oglyco_positions <- plot_kappa_casein_tree +
  plot_kappa_casein_pred_phospho_positions +
  plot_kappa_casein_pred_oglyc_positions +
  plot_layout(widths = c(1, 4, 4)) &
  theme(plot.margin = margin(0, 0, 0, 0))

save_plot(
  plot = plot_phospho_oglyco_positions,
  name = "figure_S5"
)

Figure S6

cysteines_positions <- kappa_caseins_alignment_positions %>%
  filter(residue == "C") %>%
  select(-msa_index, -residue)

plot_kappa_casein_cysteines <- plot_kappa_caseins_parts +
  geom_segment(
    size = 0.1,
    data = cysteines_positions,
    mapping = aes(
      x = seq_index,
      xend = seq_index,
      y = as.integer(species) - 0.5,
      yend = as.integer(species) + 0.5
    ),
    inherit.aes = FALSE
  ) +
  labs(title = "Cysteines") +
  NULL

(cysteines_position_plot <- {
  plot_kappa_casein_tree +
    plot_kappa_casein_cysteines
} +
  plot_layout(width = c(1, 4)) &
  theme(plot.margin = margin(0, 0, 0, 0))
)

save_plot(
  plot = cysteines_position_plot,
  name = "figure_S6"
)

Table S1

# formating for html report
kappa_casein_gi %>%
  mutate_at("species", fct_relabel, underscore_to_space) %>%
  kable(format = "html") %>%
  kable_styling()
species taxid gi
Tachyglossus aculeatus 9261 255661244
Ornithorhynchus anatinus 9258 255661248
Trichosurus vulpecula 9337 4559294
Monodelphis domestica 13616 126330606
Orycteropus afer 1230840 634853626
Trichechus manatus 127582 NA
Loxodonta africana 9785 731494712
Oryctolagus cuniculus 9986 1607
Ochotona princeps 9978 504173483
Marmota marmota 9994 984137438
Ictidomys tridecemlineatus 43179 532095382
Octodon degus 10160 507713885
Cavia porcellus 10141 115670
Heterocephalus glaber 10181 351699116
Fukomys damarensis 885580 1104935430
Castor canadensis 51338 1147391913
Dipodomys ordii 10020 852759962
Jaculus jaculus 51337 507563375
Microtus ochrogaster 79684 532033496
Peromyscus maniculatus 230844 1008789005
Nannospalax galili 1026970 674032426
Rattus norvegicus 10116 13928762
Mus pahari 10093 1195508802
Mus caroli 10089 1195722227
Mus musculus 10090 75677412
Otolemur garnettii 30611 395857266
Carlito syrichta 1868482 640822240
Aotus nancymaae 37293 817316329
Callithrix jacchus 9483 1060981464
Cebus capucinus 1737458 1044411177
Saimiri boliviensis 39432 403280959
Nomascus leucogenys 61853 332233119
Pongo abelii 9601 297673660
Gorilla gorilla 9595 426344545
Homo sapiens 9606 29676
Pan paniscus 9597 397475232
Pan troglodytes 9598 114594392
Colobus angolensis 336983 795340768
Rhinopithecus roxellana 61622 724803799
Chlorocebus sabaeus 60711 635042818
Papio anubis 9555 402869633
Cercocebus atys 9531 795383260
Mandrillus leucophaeus 9568 795308736
Macaca nemestrina 9545 795623958
Macaca mulatta 9544 109074586
Macaca fascicularis 9541 355749359
Condylura cristata 143302 507942890
Erinaceus europaeus 9365 617605017
Rousettus aegyptiacus 9407 1012258563
Pteropus alecto 9402 586524827
Pteropus vampyrus 132908 759121339
Rhinolophus sinicus 89399 1124008234
Miniopterus natalensis 291302 1016674193
Eptesicus fuscus 29078 641730880
Myotis lucifugus 59463 940768390
Myotis brandtii 109478 946799019
Manis javanica 9974 1048452318
Acinonyx jubatus 32536 961710667
Felis catus 9685 410957476
Panthera pardus 9691 1111079331
Panthera tigris 74533 591321662
Canis lupus 9615 545521723
Ursus maritimus 29073 671001068
Ailuropoda melanoleuca 9646 1126262988
Mustela putorius 9669 859857021
Enhydra lutris 391180 1244101367
Odobenus rosmarus 9708 472346437
Leptonychotes weddellii 9713 585154038
Neomonachus schauinslandi 29088 1212216747
Ceratotherium simum 73337 955478944
Equus asinus 9793 958718360
Equus caballus 9796 19031197
Camelus bactrianus 9837 429534184
Camelus dromedarius 9838 1742992
Lama glama 9844 787034497
Vicugna pacos 30538 560980638
Sus scrofa 9823 55742766
Balaenoptera acutorostrata 310752 594692663
Physeter catodon 9755 593761147
Lipotes vexillifer 118797 602729614
Delphinapterus leucas 9749 1246240428
Tursiops truncatus 9739 470658938
Orcinus orca 9733 466001209
Odocoileus virginianus 9880 1187550739
Cervus nippon 9863 295705
Bubalus bubalis 89462 295701
Bos taurus 9913 1228078
Bison bison 43346 742114810
Bos mutus 72004 440904989
Saiga tatarica 34875 1033241
Pantholops hodgsonii 59538 556777482
Rupicapra rupicapra 34869 1033238
Ovis aries 9940 57164381
Capra hircus 9925 978
Oreamnos americanus 34873 1033235
Naemorhedus goral 34871 1033232
Capricornis crispus 9966 295703
Capricornis swinhoei 34866 1033201
Capricornis sumatraensis 34865 1033203
# formatting for latex and export
kappa_casein_gi %>%
  mutate_at("species", ~ glue("\\textit{<<.x>>}", .open = "<<", .close = ">>")) %>%
  mutate_at("species", ~ fct_recode(.x,
    "\\textit{Ictidomys_tridecemlineatus} \\textsuperscript{b}" = "\\textit{Ictidomys_tridecemlineatus}",
    "\\textit{Fukomys_damarensis} \\textsuperscript{c}" = "\\textit{Fukomys_damarensis}",
    "\\textit{Bos_mutus} \\textsuperscript{h}" = "\\textit{Bos_mutus}",
    "\\textit{Carlito_syrichta} \\textsuperscript{e}" = "\\textit{Carlito_syrichta}",
    "\\textit{Vicugna_pacos} \\textsuperscript{g}" = "\\textit{Vicugna_pacos}",
    "\\textit{Neomonachus_schauinslandi} \\textsuperscript{f}" = "\\textit{Neomonachus_schauinslandi}",
    "\\textit{Nannospalax_galili} \\textsuperscript{d}" = "\\textit{Nannospalax_galili}"
  )) %>%
  mutate_at("species", fct_relabel, underscore_to_space) %>%
  mutate_at("gi", ~ ifelse(is.na(.x), "\\textsuperscript{a}", .x)) %>%
  rename(
    `taxon id` = taxid,
    `GenInfo Identifier` = gi
  ) %>%
  knitr::kable(
    format = "latex",
    booktabs = TRUE,
    linesep = "",
    longtable = TRUE,
    caption.short = "Kappa-casein GenInfo Identifiers",
    label = "gi_species",
    caption = "\\textbf{Kappa-casein GenInfo Identifiers.}",
    escape = F
  ) %>%
  kable_styling(
    latex_options = c(
      "striped",
      "hold_position",
      "full_width",
      "condensed",
      "repeat_header"
    ),
    full_width = TRUE,
    repeat_header_continued = TRUE
  ) %>%
  group_rows("Monotremes", 1, 2) %>%
  group_rows("Marsupials", 3, 4) %>%
  group_rows("Placentals", 5, 99) %>%
  footnote(
    alphabet = c(
      "First exon was recovered from a BLAST with the elephant sequence", # a - Manatee
      "Was ``Spermophilus tridecemlineatus'' in @cite{fritz_2009}", # b - Ictidomys_tridecemlineatus
      "Was ``Cryptomys damarensis'' in @cite{fritz_2009}", # c - Fukomys_damarensis
      "Used instead of ``Spalax ehrenbergi'' in @cite{fritz_2009}", # d - Nannospalax galili
      "Was ``Tarsius syrichta'' in @cite{fritz_2009}''", # e - Carlito_syrichta
      "Was ``Monachus schauinslandi\" in @cite{fritz_2009}''", # f - Neomonachus schauinslandi
      "Was ``Vicugna vicugna'' in @cite{fritz_2009}", # g - Vicugna_pacos
      "Was ``Bos grunniens'' in @cite{fritz_2009}" # h - Bos mutus
    ),
    escape = FALSE
  ) %>%
  str_replace_all("@cite", "\\\\textcite") %>%
  save_table(table = ., name = "table_S1")

Table S2

# formating for html report
kappa.casein.manual.repeats %>%
  mutate_at("species", underscore_to_space) %>%
  kable(format = "html") %>%
  kable_styling()
species N_flank_position C_flank_position sequence
Saiga tatarica 137 142 EAIVNT
Saiga tatarica 143 148 EAIVNT
Saiga tatarica 149 154 EAIVNT
Bos mutus 147 150 EASP
Bos mutus 151 154 EASP
Otolemur garnettii 112 114 PTT
Otolemur garnettii 115 117 PTI
Otolemur garnettii 141 161 PETSSVSAVTNTLEAAAVTVT
Otolemur garnettii 162 182 PEASSVSAITNTLEAAAVTVT
Otolemur garnettii 183 203 PEASSVSAVTNTLEAAAVTVT
Myotis lucifugus 95 131 PSLFAIPPKKNQDKAVIPTANTVPADEPTLIPPSEST
Myotis lucifugus 132 168 PPLFAVPPKKNQDKAVIPIVNTVPADEATLFPPSEST
Myotis lucifugus 169 205 PPLFATPPKKNQDKAVIPTINIIPADEPTVILSSEPT
Myotis brandtii 95 131 PSLFAIPPKKNQDKAVIPTANTVPADEPTLIPPSEST
Myotis brandtii 132 168 PPLIAIPPKKNQDKAVIPIVNTVPADEPTLFPPSEST
Myotis brandtii 169 205 PPLIAIPPKKNQDKAVIPTINIIAADEPTVILSSEPT
Jaculus jaculus 101 112 NADPNASAIPSA
Jaculus jaculus 113 124 NAHPDASAIPSA
Jaculus jaculus 125 136 NAHPDASAIPSP
Cavia porcellus 133 145 SAGDTPEVSSQFI
Cavia porcellus 146 171 DTPDTSVLAEEARESPEDTPEISEFI
Cavia porcellus 172 198 NAPDTAVPSEEPRESAEDTPEISSEFI
Castor canadensis 51 62 INNPYMPYPYYV
Castor canadensis 63 74 ISNPYMSYPYYS
Dipodomys ordii 48 62 INSPYMPFPYYA
Dipodomys ordii 63 69 VNN LPYTYST
Manis javanica 33 35 NSL
Manis javanica 36 38 NSS
Sus scrofa 128 133 EPIVNA
Sus scrofa 134 139 EPIVNA
format_species_ptr_table <- . %>%
  fct_relabel(underscore_to_space) %>%
  map_chr(~ glue("\\textit{<<.x>>}", .open = "<<", .close = ">>")) # italicized

format_sequence_ptr_table <- . %>%
  map_chr(~ str_remove_all(.x, "[ -]")) %>% # remove gaps
  map_chr(~ glue("\\texttt{<<.x>>}", .open = "<<", .close = ">>")) # monospaced

# formatting for latex and export
kappa.casein.manual.repeats %>%
  order_species_tree() %>%
  mutate_at("species", format_species_ptr_table) %>%
  mutate_at("sequence", format_sequence_ptr_table) %>%
  rename(N = N_flank_position, C = C_flank_position) %>%
  knitr::kable(
    format = "latex",
    booktabs = TRUE,
    linesep = "",
    longtable = TRUE,
    caption.short = "Protein tandem repeats found in kappa-casein sequences.",
    label = "supp_ptr",
    caption = "\\textbf{Protein tandem repeats found in kappa-casein sequences.} 
    Positions are given for the extremities of each tandem repeat unit 
    in the mature sequence.",
    escape = FALSE
  ) %>%
  add_header_above(c(" ", "position" = 2, " ")) %>%
  kable_styling(
    latex_options = c(
      "striped",
      "hold_position",
      "full_width",
      "condensed"
    ),
    full_width = TRUE,
    repeat_header_continued = TRUE
  ) %>%
  # footnote() %>%
  column_spec(1, width = "1.5in") %>%
  column_spec(2, width = "0.5in") %>%
  column_spec(3, width = "0.5in") %>%
  save_table(table = ., name = "table_S2")

Table S3

# formating for html report
all_predictions_metrics %>%
  mutate_if(is.numeric, round, digits = 2) %>%
  kable(format = "html") %>%
  kable_styling()
method sensivity specificity precision accuracy MCC
glycomine 0.75 0.95 0.86 0.89 0.72
glycopred 0.94 0.32 0.38 0.51 0.28
O-GlcNAcPRED-II 0.56 0.62 0.39 0.60 0.17
netoglyc 0.56 0.54 0.35 0.55 0.09
# formatting for latex and export
all_predictions_metrics %>%
  mutate_if(is.numeric, round, digits = 2) %>%
  mutate(method = fct_recode(method,
    "GlycoMine" = "glycomine",
    "O-GlcNAcPRED-II" = "O-GlcNAcPRED-II",
    "NetOClyc4.0" = "netoglyc",
    "GlycoPred" = "glycopred"
  )) %>%
  kable(
    format = "latex",
    longtable = TRUE,
    booktabs = TRUE,
    caption.short = "Prediction of O-glycosylation in kappa-casein",
    label = "supp_glyco_pred_methods",
    caption = "\\textbf{Prediction of O-glycosylation in kappa-casein 
    with GlycoMine \\autocite{li_2015a}; 
    GlycoPred \\autocite{hamby_2008}; 
    O-GlcNAcPRED-II \\autocite{jia_2018}; 
    and NetOGlyc4.0 \\autocite{steentoft_2013}.}",
    escape = F
  ) %>%
  kable_styling(latex_options = c("striped", "hold_position", "full_width")) %>%
  save_table(table = ., name = "table_S3")

Session info

  • Date: 2019-02-12
  • OS: Linux Mint 18.3
  • Version: R version 3.4.4 (2018-03-15)
  • Packages: acepack 1.4.1, ade4 1.7-13, ape 5.2, aphid 1.3.1, askpass 1.1, assertthat 0.2.0, backports 1.1.3, badger 0.0.4, base64enc 0.1-3, bindr 0.1.1, bindrcpp 0.2.2, BiocGenerics 0.24.0, Biostrings 2.46.0, broom 0.5.1, cellranger 1.1.0, checkmate 1.8.5, cleaver 1.21.4, cli 1.0.1, cluster 2.0.7-1, codetools 0.2-15, colorspace 1.4-0, crayon 1.3.4, data.table 1.12.0, digest 0.6.18, dlstats 0.1.0, dplyr 0.7.8, evaluate 0.12, farver 1.1.0, fastmatch 1.1-0, forcats 0.3.0, foreign 0.8-71, formatR 1.5, Formula 1.2-3, generics 0.0.2, geoscale 2.0, ggforce 0.1.3, ggplot2 3.1.0, ggpmisc 0.3.0, ggstance 0.3.1, ggthemes 4.0.1, ggtree 1.15.2, glue 1.3.0, gridExtra 2.3, gtable 0.2.0, haven 2.0.0, here 0.1, highr 0.7, Hmisc 4.1-1, hms 0.4.2, htmlTable 1.12, htmltools 0.3.6, htmlwidgets 1.3, httr 1.4.0, igraph 1.2.2, IRanges 2.12.0, jsonlite 1.6, kableExtra 0.9.0, kmer 1.1.0, knitr 1.21.7, labeling 0.3, lattice 0.20-38, latticeExtra 0.6-28, lazyeval 0.2.1, lubridate 1.7.4, magrittr 1.5, MASS 7.3-51.1, Matrix 1.2-14, modelr 0.1.2, munsell 0.5.0, nlme 3.1-137, nnet 7.3-12, openssl 1.2.1, patchwork 0.0.1, Peptides 2.4, phangorn 2.5.2, phylogram 2.1.0, pillar 1.3.1, pkgconfig 2.0.2, plyr 1.8.4, ProjectTemplate 0.8.2, purrr 0.2.5, quadprog 1.5-5, R6 2.3.0, RColorBrewer 1.1-2, Rcpp 1.0.0, readr 1.3.0, readxl 1.2.0, rlang 0.3.1, rmarkdown 1.11, rpart 4.1-13, rprojroot 1.3-2, rstudioapi 0.9.0, rvcheck 0.1.3, rvest 0.3.2, S4Vectors 0.16.0, scales 1.0.0, seqinr 3.4-5, sessioninfo 1.1.1, stringi 1.2.4, stringr 1.3.1, survival 2.43-3, tibble 2.0.1, tidyr 0.8.2, tidyselect 0.2.5, tidytree 0.2.1, tidyverse 1.2.1, treeio 1.7.1, tweenr 1.0.1, units 0.6-2, viridisLite 0.3.0, withr 2.1.2.9000, xfun 0.4, XML 3.99-0, xml2 1.2.0, XVector 0.18.0, yaml 2.2.0, zlibbioc 1.24.0